##############################################################################
#_______________________________ Depth Volume _______________________________#
##############################################################################

# This script produces figures and regression output to test the correlation
# between depth and volume on the Top 5 exchanges in 2015.
#
# The data used for this exercise is "Depth_Volume_Shares_Top5_2015.csv" dataset,
# which contains date-symbol-exchange level observations of depth and volume 
# shares (e.g. NYSE's share of AAPL volume traded on May 8, 2015 or NASDAQ's 
# share of depth for the first hour of the trading day, for MSFT on May 8, 2015).
#
# (Note that depth in this dataset is computed from the BBO among the Top 5
# exchanges).
#
# This dataset has already been merged with the "Symbol_Universe_2015" dataset
# and only contains observations for the sample symbols (flagged with "f_sample").
# We also use the "f_Sample_T100" flag from the symbol universe file to identify
# the Top 100 symbols and "f_NYSE_Listed" to confirm that all rows missing
# depth shares and volume shares are observations of NYSE for NYSE Listed stocks
#
# The output of the script is generated in the following order:
#
# 1. Depth shares and volume shares among the Top 5 Maker-Taker exchanges
#    for the Top 100 symbols
# 3. Output of regression between depth and volume shares among the Top 5
#    maker-taker exchanges (shares computed from the Top 5 exchanges, Top 100
#    symbols)
#    (a) point estimate of the coefficient
#    (b) R2 of the regression
#    (c) standard error of the coefficient (clustered by symbol and exchange)
#    (d) p-value of the t-test, where the null hypothesis is the coefficient
#        is equal to 1.
# 4. Output of symbol-level regressions between depth and volume shares
#    for each of the top 100 symbols (shares computed among Top 5 and Top 8
#    regressions)
#    (a) density plots of the coefficients estimates for each symbol
#    (b) density plots of the R2 for each symbol
#    (c) histograms of the coefficients estimates for each symbol
#    (b) histograms of the R2 for each symbol
#    (d) table counting for how many symbols can we reject the hypothesis that
#        the coefficient is equal to 1 at the 90%, 95%, and 99% levels.
#
# As a robustness check, we also perform the same exercise but with depth and
# volume shares computed at various time intervals including:
#   (1) Top 100 symbols, depth and volume shares at 5 minute level
#   (2) Top 100 symbols, depth and volume shares at 10 minute level
#   (3) Top 100 symbols, depth and volume shares at 30 minute level
#   (4) Top 100 symbols, depth and volume shares at 1 hour level
#   (5) SPY, depth and volume shares at 1s, 10s, 30s, 1min, 10min, 30min,
#       1 hour, 2 hour and 6 hour levels.

################################################################################
#_______________________________ Load Libraries _______________________________#
################################################################################

library(ggplot2)
library(plyr)
library(dplyr)
library(multiwayvcov)
library(data.table)
library(lubridate)
library(latex2exp)

# Suppress warnings
options(warn=-1)

#############################################################################
#_______________________________ Directories _______________________________#
#############################################################################

home.dir <- "~path-to-data-appendix/3_Analysis_of_TAQ_data"
data.dir <- file.path(home.dir, 'WRDS_Server/Output')
fig.dir <- file.path(home.dir, 'Final_Output/Figures')
table.dir <- file.path(home.dir, 'Final_Output/Tables')

###########################################################################
#_______________________________ Functions _______________________________#
###########################################################################

#_______________________________ Plot Functions _______________________________#

# plot.depth_volume
# Plots the depth and volume shares for each exchange on the x and y axis
#
# @return:
#   Figure plotting the depth share and volume share for exchanges on all symbol-date-intervals in the sample.
#
# @args (for reproducing results):
#   data: df, a dataframe that contains the depth shares and volume shares.
#     At the symbol-date(interval)-exchange level.
#   df_label: df, a dataframe with a row for each exchange included in this figure,
#     which contains a label for the exchange
#     along with (x, y) coordinates for each label.
#   figurePath: string, path to the directory where this figure will be exported to.
#   x: string, the variable in "data" used as the x variable.
#     This is usually depth share, but the exact variable depends
#     on whether the shares are calculated from the main maker-taker exchanges
#     or the Top 8.
#   y: string, the variable in "data" used as the y variable.
#     This is usually volume share, but the exact variable depends
#     on whether the shares are calculated from the main maker-taker exchanges
#     or the Top 8.
#   group_by: string that determines which variable is used
#     to color the points in the figure. Default is exchange,
#     so all observations by the same exchange will have the same color.
#   xlab: string, label for the x axis.
#   ylab: string, label for the y axis.
#   title: string, title of the plot.
#   lim: num_vector, vector containing the limits for both axes.
#   line_color: string, specifies the color of the 45 degree line.
#   line_size: float, specifies the size of the 45 degree line.
#   line_type: string, specifies the line type of the 45 degree line.
#   slope: float, specifies the slope of the 45 degree line
#     (by definition, we use a slope of 1).
#   axis_size: float, specifies the size of the axis labels.
#   exchange_label_size: float, specifies the size of the exchange labels.
#
plot.depth_volume <- function(data,  df_label, figure_path = "", x = "", y = "", group_by = "exchange",
                              xlab = "Depth Share", ylab = "Volume Share", title = "", lim = c(0,1),
                              exchange_priority = NA,
                              exchange_color_order, color_palette,
                              background = FALSE, line_color = "black",
                              line_size = 0.7, line_type = "solid", slope = 1,
                              axis_size = 31,
                              exchange_label_size = 7,
                              f_legend = FALSE) {

  # Generate figure.

  # for (ex in exchange_priority) {
  #     g <- g + geom_point(data = subset(data, exchange == ex), size = 2, aes_string(x = x, y = y, color = group_by))
  # }

  g <- ggplot(data, aes_string(x = x, y = y, color = group_by)) +
    geom_point(size = 2) +
    labs(title = paste(title), x = xlab, y = ylab, color = "Exchange") +
    scale_x_continuous(breaks = seq(0,1,0.2), limits = lim) +
    scale_y_continuous(breaks = seq(0,1,0.2), limits = lim) +
    scale_colour_manual(drop = TRUE,values = color_palette) +
    geom_abline(intercept = 0, slope = slope, size = line_size, linetype = line_type, color = line_color)

  if (f_legend == TRUE) {
      g <- g + theme(axis.text.x = element_text(size = axis_size),
                     axis.text.y = element_text(size = axis_size),
                     axis.title.x = element_text(size = axis_size),
                     axis.title.y = element_text(size = axis_size),
                     legend.title=element_text(size=25),
                     legend.text=element_text(size=20),
                     legend.key.size = unit(1, "cm"),
                     legend.position = c(0.862, 0.15)) +
                guides(col = guide_legend(override.aes = list(shape = 15, size = 10)))
  }
  else {
      g <- g + theme(axis.text.x = element_text(size = axis_size),
                     axis.text.y = element_text(size = axis_size),
                     axis.title.x = element_text(size = axis_size),
                     axis.title.y = element_text(size = axis_size),
                     title = element_text(size = 27),
                     legend.position = "none") +
               geom_text(data = df_label,
                         aes(x = x, y = y, label = label, color = factor(label)),
                         size = exchange_label_size)
  }

  if (background == TRUE) {
      g <- g + theme(panel.grid.major = element_blank(),
                     panel.grid.minor = element_blank(),
                     panel.background = element_blank(),
                     panel.border = element_rect(size = 1.5, fill = NA))
  }

  # Save figure.
  ggsave(file=figure_path, plot = g, width = 12, height= 12, dpi=120)

}

# plot.depth_volume_density
# Plots the density plot of the coefficients and R2s of the symbol-level
# depth-volume regressions.
#
# @return:
#   Density plots describing the results (coefficient/R2) of the symbol-level
#   depth-volume regressions.
#
# @args (for reproducing results):
#   data: df, a dataframe that contains the depth shares and volume shares.
#     At the symbol-date(interval)-exchange level.
#   df_label: df, a dataframe with a row for each exchange included in this figure,
#     which contains a label for the exchange
#     along with (x, y) coordinates for each label.
#   figurePath: string, path to the directory where this figure will be exported to.
#   x: string, the variable in "data" used as the x variable.
#     This is usually depth share, but the exact variable depends
#     on whether the shares are calculated from the main maker-taker exchanges
#     or the Top 8.
#   xlab: string, label for the x axis.
#   ylab: string, label for the y axis.
#   title: string, title of the plot.
#   xlim: num_vector, vector containing the limits of the x-axis
#   breakpoint: float, the spacing between ticks in the x-axis
#   vertical_line: float, location on the x-axis for the vertical, blue-dotted
#     line. In this setting, this is the value of the coefficient or the R2
#     of the blue-dotted line.
#
plot.depth_volume_density <- function(data, figure_path, x = "", xlab = "", ylab = "", title = "",
                                      xlim = c(), breakpoint = 0.1, vertical_line = 0){

  # Produces the density plot
  # Note: "y=..density.." plots the density instead of the count.
  g <- ggplot(data, aes_string(x = x)) +
    geom_density(aes(y=..density..), alpha = 1, fill = "grey", size = 0.8) +
    labs(title = title, x = xlab, y = ylab) +
    theme(axis.text.x = element_text(size = 22), axis.text.y = element_text(size = 22), axis.title.x = element_text(size = 22),
          axis.title.y = element_text(size = 22), title = element_text(size = 27))

  # If the breakpoint argument is greater than 0, we space the ticks
  # of the x-axis by the amount specified in breakpoint.
  if(length(xlim) > 0 & breakpoint > 0){
    g <- g + scale_x_continuous(limits = xlim, breaks = seq(xlim[1],xlim[2],breakpoint))
  }
  else if(length(xlim) > 0 ){
    g <- g + scale_x_continuous(limits = xlim)
  }

  # Add a vertical line at the location on the x-axis specified by "vertical_line".
  vertical_line <- as.numeric(as.character(vertical_line))
  if(vertical_line > 0){
    g <- g + geom_vline(xintercept = vertical_line, size = 1.0, linetype = "dashed", color = "blue")
  }

  # Save the figure.
  ggsave(file=figure_path,
         plot = g, width = 12, height= 8.8, dpi=120)
}

# plot.depth_volume_histogram
# Plots the histogram of the coefficients and R2s of the symbol-level
# depth-volume regressions.
#
# @return:
#   Histograms describing the results (coefficient/R2) of the symbol-level
#   depth-volume regressions.
#
# @args (for reproducing results):
#   data: df, a dataframe that contains the depth shares and volume shares.
#     At the symbol-date(interval)-exchange level.
#   df_label: df, a dataframe with a row for each exchange included in this figure,
#     which contains a label for the exchange
#     along with (x, y) coordinates for each label.
#   figurePath: string, path to the directory where this figure will be exported to.
#   x: string, the variable in "data" used as the x variable.
#     This is usually depth share, but the exact variable depends
#     on whether the shares are calculated from the main maker-taker exchanges
#     or the Top 8.
#   xlab: string, label for the x axis.
#   ylab: string, label for the y axis.
#   title: string, title of the plot.
#   xlim: num_vector, vector containing the limits of the x-axis
#   breakpoint: float, the spacing between ticks in the x-axis
#   vertical_line: float, location on the x-axis for the vertical, blue-dotted
#     line. In this setting, this is the value of the coefficient or the R2
#     of the blue-dotted line.
#
plot.depth_volume_histogram <- function(data, figure_path, x = "", xlab = "", ylab = "", title = "",
                                      binwidth = 0.01, xlim = c(), breakpoint = 0.1, vertical_line = 0){

  # Produces the histogram
  # Note: "y=..density.." plots the density instead of the count.
  # Density is used here in order to compare with the density plots, in order
  # to see how much smoothing is involved with the density plots.
  g <- ggplot(data, aes_string(x = x)) +
    geom_histogram(aes(y=..density..), colour="black", binwidth = binwidth) +
    labs(title = title, x = xlab, y = ylab) +
    theme(axis.text.x = element_text(size = 22), axis.text.y = element_text(size = 22), axis.title.x = element_text(size = 22),
          axis.title.y = element_text(size = 22), title = element_text(size = 27))

    # If the breakpoint argument is greater than 0, we space the ticks
    # of the x-axis by the amount specified in breakpoint.
    if(length(xlim) > 0 & breakpoint > 0){
      g <- g + scale_x_continuous(limits = xlim, breaks = seq(xlim[1],xlim[2],breakpoint))
    }
    else if(length(xlim) > 0 ){
      g <- g + scale_x_continuous(limits = xlim)
    }

  # Add a vertical line at the location on the x-axis specified by "vertical_line".
  vertical_line <- as.numeric(as.character(vertical_line))
  if(vertical_line > 0){
    g <- g + geom_vline(xintercept = vertical_line, size = 1.0, linetype = "dashed", color = "blue")
  }

  # Save the figure.
  ggsave(file=figure_path,
         plot = g, width = 12, height= 8.8, dpi=120)
}

#_______________________________ Regression Functions _______________________________#

# regression.R2
# Runs a regression between an x and y variable and reports the R2,
# which is calculated manually from the residuals and total sum of squares.
#
# We calculate the R2 manually because when the R2 function from summary.lm
# calculates the total sum of squares it calculates the sum of squares without
# subtracting the mean when the intercept is suppressed.
#
# In other words it calculates sum(volume_share^2) instead of sum((volume_share -
# mean(volume_share))^2).
#
# This function also manually tests the hypothesis that the regression coefficient
# is equal to 1. We do this manually in order to cluster the standard errors
# on symbol and exchange.
#
# @return:
#   output: a datframe with a row for the coefficient, standard error, and R2 of the regression
#
# @args (for reproducing results):
#   data: df, a dataframe that contains the depth shares and volume shares.
#     At the symbol-interval-exchange level.
#   x: string, the name of the independent variable.
#   y: string, the name of the dependent variable.
#   digits: int, the number of digits used to round the regression output
#

depth_volume_regression <- function(data, x, y, digits, cluster = TRUE) {

  # Remove rows that are missing depth share and volume share.
  # In our data, this occurs for observations of NYSE on non-NYSE listed
  # symbols. See below for more details in "Plot Depth Volume for Top 8 exchanges".
  data_no_NA <- data[!is.na(data[[x]]) & !is.na(data[[y]]),]

  # Runs the regression, suppresses the intercept.
  formula <- as.formula(paste0(y," ~ 0 + ",x))
  fit <- lm(formula, data_no_NA)
  sm <- summary(fit)

  # Extract regression coefficient.
  coefficient <- fit$coef[x]

  # Extract standard error, clustering by symbol and exchange.
  # "cluster.vcov" returns a number (not a matrix) since we only have 1
  # variable. We include "diag" as a convention.
  if (cluster) {
    se <- sqrt(diag(cluster.vcov(fit, ~ symbol + EX)))
  } else {
    se <- sm$sigma
  }


  # Obtain t statistic of the hypothesis that the coefficient is statistcally indistinguishable from 1.
  t_stat <- (1 - coefficient) / se
  t_stat <- abs(t_stat)

  # Calculate the p-value for this t statistic.
  # We take the absolute value of the t-statistic to make it positive.
  # Then we calculate the probability P(X > t_stat) with "pt", which is the t distribution CDF.
  # We set lower.tail to False to get P(X > t_stat) instead of P(X < t_stat).
  # Then we multiply this probability by 2 to get the two-tailed test p-value.
  p_value <- 2 * pt(t_stat, df = nobs(fit) - 1, lower.tail = FALSE)

  # Calculate R2.
  # We calculate the residual sum of squares from the residuals in the summary output,
  # and we calculate the total sum of squares directly using the mean.
  resid <- sm$residuals
  dif_mean <- data_no_NA[,y] - mean(data_no_NA[,y])
  R2 <- 1 - sum(resid^2)/sum(dif_mean^2)

  # Labels for the output.
  reg_output_labels <- c("coefficient", "standard error", "p-value", "R2")

  # Round regression output.
  reg_output <- c(format(round(coefficient, digits), nsmall = digits),
                  format(round(se, digits), nsmall = digits),
                  format(round(p_value, digits), nsmall = digits),
                  format(round(R2, digits), nsmall = digits))

  # Store regression output as a dataframe.
  # output <- cbind(as.matrix(reg_output_labels), as.matrix(reg_output))
  # output <- data.frame(output)
  #
  # # Rename column names and row names.
  # colnames(output) <- c("Label", "Result")
  # rownames(output) <- 1:nrow(output)

  output <- data.frame(Label = reg_output_labels, Result = reg_output)
  return(output)
}

# symbol_depth_volume_regressions
# For each symbol, this function runs a regression between an x and y variable
# (volume share and depth share in this setting) for the observations containing
# that symbol.
#
# This function saves the coefficient, R2, standard error of the coefficient,
# and the p-value for the t-test of the hypothesis that the coefficient is
# different from 1, for each regression.
#
# We calculate the R2 manually because when the R2 function from summary.lm
# calculates the total sum of squares it calculates the sum of squares without
# subtracting the mean when the intercept is suppressed. In other words,
# the built-in method calculates sum(volume_share^2) instead of sum((volume_share -
# mean(volume_share))^2), which is what we use.
#
# @return:
#   output: a dataframe with a row for each symbol containing the coefficient
#     and R2 of the depth-volume regression for that symbol.
#
# @args (for reproducing results):
#   data: df, a dataframe that contains the depth shares and volume shares.
#     At the symbol-interval-exchange level.
#   x: string, the name of the independent variable.
#   y: string, the name of the dependent variable.
#   digits: int, the number of digits used to round the regression output
#
symbol_depth_volume_regressions <- function(data, x = "", y = "") {

  # Remove rows that are missing depth share and volume share.
  # In our data, this occurs for observations of NYSE on non-NYSE listed
  # symbols. See below for more details in "Plot Depth Volume for Top 8 exchanges".
  data_no_NA <- data[!is.na(data[,x]) & !is.na(data[,y]),]

  # Extract symbols from data
  symbols <- unique(data$symbol)

  # The formula of the depth-volume regression is:
  # depth_share ~ volume_share + eps (intercept is suppressed)
  formula <- as.formula(paste0(y,"~ 0 + ",x))

  # Empty vectors to store results
  coefs <- c()
  std_error <- c()
  p_value <- c()
  R2 <- c()

  for(symbol in symbols){
    # Extract subset of the observations involving this symbol.
    symbol_data <- data_no_NA[data_no_NA$symbol == symbol,]

    # Run the depth volume regression for this subset.
    fit <- lm(formula, symbol_data)

    # Extract residuals.
    sm <- summary(fit)
    resid <- sm$residuals

    # Extract coefficient.
    coef <- fit$coef

    # Extract standard error, clustered by exchange.
    # "cluster.vcov" returns a number (not a matrix) since we only have 1
    # variable. We include "diag" as a convention.
    se <- sqrt(diag(cluster.vcov(fit, ~ exchange)))

    # Obtain t statistic of the hypothesis that the coefficient is statistically
    # indistinguishable from 1.
    t_stat <- (1 - coef) / se
    t_stat <- abs(t_stat)

    # Calculate the p-value for this t statistic.
    # We take the absolute value of the t-statistic to make it positive.
    # Then we calculate the probability P(X > t_stat) with "pt", which is the t distribution CDF.
    # We set lower.tail to False to get P(X > t_stat) instead of P(X < t_stat).
    # Then we multiply this probability by 2 to get the two-tailed test p-value.
    p_symbol <- 2 * pt(t_stat, df = nobs(fit) - 1, lower.tail = FALSE)

    # Append the coefficient, standard errors and p-values to the storage vectors.
    coefs <- c(coefs, coef)
    std_error <- c(std_error, se)
    p_value <- c(p_value, p_symbol)

    # Calculate R2.
    # We calculate the residual sum of squares from the residuals in the summary output,
    # and we calculate the total sum of squares directly using the mean.
    dif_mean <- symbol_data[,y] - mean(symbol_data[,y])
    R2_symbol <- 1 - sum(resid^2)/sum(dif_mean^2)

    # Append the coefficient and R2 to the storage vectors.
    R2 <- c(R2, R2_symbol)
  }

  # Output a dataframe with a row for each symbol containing the coefficient and R2.
  output <- data.frame(symbol = symbols, coef = coefs, R2 = R2, se = std_error, p_value = p_value)
  return(output)
}

# make_hypothesis_test_table
# This function uses the symbol depth-volume regression output produced by
# symbol_depth_volume_regressions for the datasets where the shares are computed
# from the top 8 exchanges and the top 5 exchanges.
#
# This function counts how many symbols have a coefficient p-value (p-value
# of the test of the hypothesis H0: coefficient = 1) less than 0.1, 0.05, and 0.01.
# Then we save and export a TeX table with these counts.
#
# @return:
#   tex: a character string that contains the table reporting the hypothesis
#     rejection counts in TeX.
#
# @args (for reproducing results):
#   top5_reg_results: df, the symbol-level dataset containing the depth-volume
#     regression results for all symbols in the top 100. The depth-volume
#     regressions used the volume and depth shares computed among the top 5
#     exchanges
#   top8_reg_results: df, the symbol-level dataset containing the depth-volume
#     regression results for all symbols in the top 100. The depth-volume
#     regressions used the volume and depth shares computed among the top 8
#     exchanges
#   table_path: string, file path for the table.
#
make_hypothesis_test_table <- function(top5_reg_results, top8_reg_results, table_path){

  top5_reg_results <- sym_depth_volume_reg_top5_t100
  top8_reg_results <- sym_depth_volume_reg_top8_t100
  # Find how many symbols rejected at 90, 95, and 99 percent level
  # for the depth-volume regressions with shares computed among the top 8 exchanges
  num_symbols_90_pct_top_8 <- length(top8_reg_results[top8_reg_results$p_value  <= 0.1,]$symbol)
  num_symbols_95_pct_top_8 <- length(top8_reg_results[top8_reg_results$p_value  <= 0.05,]$symbol)
  num_symbols_99_pct_top_8 <- length(top8_reg_results[top8_reg_results$p_value  <= 0.01,]$symbol)

  # Save counts
  top_8_results <- c(num_symbols_90_pct_top_8, num_symbols_95_pct_top_8, num_symbols_99_pct_top_8)

  # Find how many symbols rejected at 90, 95, and 99 percent level
  # for the depth-volume regressions with shares computed among the top 5 exchanges
  num_symbols_90_pct_top_5 <- length(top5_reg_results[top5_reg_results$p_value  <= 0.1,]$symbol)
  num_symbols_95_pct_top_5 <- length(top5_reg_results[top5_reg_results$p_value  <= 0.05,]$symbol)
  num_symbols_99_pct_top_5 <- length(top5_reg_results[top5_reg_results$p_value  <= 0.01,]$symbol)

  # Save counts
  top_5_results <- c(num_symbols_90_pct_top_5, num_symbols_95_pct_top_5, num_symbols_99_pct_top_5)

  # Store in a dataframe
  hypothesis_results <- data.frame(top_8 = top_8_results, top_5 = top_5_results)

  # Print TeX table with the counts
  rejection_labels <- c("Rejected at the 90 percent level", "Rejected at the 95 percent level", "Rejected at the 99 percent level")
  tex <- character(nrow(hypothesis_results))
  for(i in 1:nrow(hypothesis_results)){
    tex[i] <- paste(rejection_labels[i], paste(hypothesis_results[i,], collapse = " & "), sep = " & ")
    tex[i] <- paste(tex[i], " \\\\ ")
  }

  # Print the TeX string into a sink
  sink(file = table_path)
  cat(tex)
  sink()
}

# depth_volume_interval
# This function performs the depth-volume routine (plot and regressions)
# for depth-volume shares at different time levels.
#
# @return:
#   Figure plotting the depth share and volume share for exchanges on all
#     symbol-date-intervals in the sample.
#   output: a dataframe with a row for each symbol containing the coefficient
#     and R2 of the depth-volume regression for that symbol.
#
# @args (for reproducing results):
#   df_file_name: string, file name (including directory location) of the depth-
#     volume dataset.
#   interval: string, name of the time interval of depth-volume shares. Used
#     for filenames of outputs.
#   cluster: bool, flag which specifies whether the depth-volume regression
#     will have clustered standard errors.
#
depth_volume_interval <- function(df_file_name,
                                  interval,
                                  figure_path = file.path(fig.dir, "Depth Volume Intervals"),
                                  cluster = TRUE) {

  # Exchange code
  exchange_code = c('N','T','P','Z','K')
  exchange_name = c("NYSE","NASDAQ","NYSE Arca","BATS","EDGX")

  # Labels
  main_maker_exchanges <- c("EDGX","NYSE","NYSE Arca","NASDAQ","BATS")
  df_label_main_maker <- data.frame(label = main_maker_exchanges,
                                    x = c(0.2, 0.55, 0.57, 0.75, 0.18),
                                    y = c(0.51, 0.25, 0.71, 0.54, 0.00))

  # BATS, EDGX, NYSE, Arca, Nasdaq
  exchange_priority <- c("BATS", "EDGX", "NYSE", "NYSE Arca","NASDAQ")
  color_palette     <- c("green3", "#619CFF", "navy", "purple", "forest green")

  ## Load the dataset
  depth_volume_df <- data.table::fread(df_file_name)
  depth_volume_df <- data.frame(depth_volume_df)

  # Map exchange names to exchange codes
  depth_volume_df$exchange <- plyr::mapvalues(depth_volume_df$EX, from = exchange_code, to = exchange_name)
  #depth_volume$exchange <- as.factor(depth_volume$exchange)
  depth_volume_df$exchange <- factor(depth_volume_df$exchange, levels = exchange_priority)
  # Change date to the R date format
  depth_volume_df$DATE <- as.Date(as.character(depth_volume_df$DATE), format = "%Y-%m-%d")

  # Restrict to the top 5 main maker-taker exchanges, top 100 symbols,
  depth_volume_df_main_maker <- depth_volume_df[depth_volume_df$f_Main_Maker == 1 & depth_volume_df$f_Sample_T100 == 1,]

  # Order points according to the order outlined in "exchange_priority"
  depth_volume_df_main_maker <- depth_volume_df_main_maker[order(depth_volume_df_main_maker$exchange),]

  # Randomize edgx nasdaq and bats
  nsdq_edgx_bats_only <- depth_volume_df_main_maker[depth_volume_df_main_maker$exchange %in% c("NASDAQ", "BATS", "EDGX"),]
  depth_volume_df_main_maker[depth_volume_df_main_maker$exchange %in% c("NASDAQ", "BATS", "EDGX"),] <- nsdq_edgx_bats_only[order(sample(nrow(nsdq_edgx_bats_only))),]

  ## Plot
  plot_name <- paste0("depthVolumeShare_main5_", interval, ".png")

  plot.depth_volume(data = depth_volume_df_main_maker,
                    df_label = df_label_main_maker,
                    figure_path = file.path(figure_path, plot_name),
                    exchange_color_order = ex_color,
                    color_palette = color_palette,
                    background = FALSE,
                    lim = c(0, 1),
                    exchange_label_size = 7,
                    x = "depth_share_main_maker",
                    y = "volume_share_main_maker")

  ## Run Regressions
  print(paste0("Depth-Volume Regression (Top 5 shares): ", interval))
  pooled_depth_volume_top5_t100 <- depth_volume_regression(depth_volume_df_main_maker, "depth_share_main_maker", "volume_share_main_maker", digits = 3, cluster = cluster)
  print(pooled_depth_volume_top5_t100)
  cat("\n")

  # Clear memory
  rm(depth_volume_df)
}

############################################################################
#_______________________________ Top 5 Plot _______________________________#
############################################################################

# Set random seed
set.seed(1)

# Sink R print output to text
sink(file.path(home.dir, "Final_Output/Logs/Depth_Volume_log.txt"))

# Import Depth Volume.
depth_volume <- data.table::fread(file.path(data.dir, "Depth_Volume_Shares_Top5_2015.csv"))
depth_volume <- data.frame(depth_volume)

# ________________________ Hardcoded values for plotting ________________________

# Exchange code
exchange_code = c('N','T','P','Z','K')
exchange_name = c("NYSE","NASDAQ","NYSE Arca","BATS","EDGX")

# Labels
main_maker_exchanges <- c("EDGX","NYSE","NYSE Arca","NASDAQ","BATS")
df_label_main_maker <- data.frame(label = main_maker_exchanges,
                                  x = c(0.2, 0.55, 0.57, 0.75, 0.18),
                                  y = c(0.51, 0.25, 0.71, 0.54, 0.00))

# BATS, EDGX, NYSE, Arca, Nasdaq
exchange_priority <- c("BATS", "EDGX", "NYSE", "NYSE Arca", "NASDAQ")
color_palette     <- c("green3", "#619CFF", "navy", "purple", "forest green")

# ________________________ Clean data and layer points ________________________

# Map exchange names to exchange codes
depth_volume$exchange <- plyr::mapvalues(depth_volume$EX, from = exchange_code, to = exchange_name)
#depth_volume$exchange <- as.factor(depth_volume$exchange)
depth_volume$exchange <- factor(depth_volume$exchange, levels = exchange_priority)
# Change date to the R date format
depth_volume$DATE <- as.Date(as.character(depth_volume$DATE), format = "%Y-%m-%d")

# Restrict to the top 5 main maker-taker exchanges, top 100 symbols,
depth_volume_main_maker_T100 <- depth_volume[depth_volume$f_Main_Maker == 1 & depth_volume$f_Sample_T100 == 1,]

# Order points according to the order outlined in "exchange_priority"
depth_volume_main_maker_T100 <- depth_volume_main_maker_T100[order(depth_volume_main_maker_T100$exchange),]

# Randomize edgx nasdaq and bats
nsdq_edgx_bats_only <- depth_volume_main_maker_T100[depth_volume_main_maker_T100$exchange %in% c("NASDAQ", "BATS", "EDGX"),]
depth_volume_main_maker_T100[depth_volume_main_maker_T100$exchange %in% c("NASDAQ", "BATS", "EDGX"),] <- nsdq_edgx_bats_only[order(sample(nrow(nsdq_edgx_bats_only))),]

######################################################################
#_______________________________ Plot _______________________________#
######################################################################

# Top 5 shares
plot.depth_volume(data = depth_volume_main_maker_T100,
                  df_label = df_label_main_maker,
                  figure_path = file.path(fig.dir, "depth_volume_main_maker_t100.png"),
                  exchange_color_order = ex_color,
                  color_palette = color_palette,
                  background = FALSE,
                  lim = c(0, 1),
                  exchange_label_size = 7,
                  x = "depth_share_main_maker",
                  y = "volume_share_main_maker")

#############################################################################
#_______________________________ Regressions _______________________________#
#############################################################################

cat("*** Depth-volume shares regression ***", "\n")
cat("\n")

# Top 5 Main Maker-Taker exchanges, top 100 symbols

print("We regress volume shares on depth shares for the Top 5 maker-taker exchanges.")
print("The shares are computed among the Top 5, and not the Top 8.")
print("We report the coefficient, the standard error, and the R2 of the regression.")
print("We also test the null hypothesis that the coefficient of this regression = 1 and report the p-value.")
pooled_depth_volume_top5_t100 <- depth_volume_regression(depth_volume_main_maker_T100, "depth_share_main_maker", "volume_share_main_maker", digits = 3)
pooled_depth_volume_top5_t100
cat("\n")

##########################################################################################
#_______________________________ Symbol-level Regressions _______________________________#
##########################################################################################

cat("*** Symbol-level depth-volume shares regressions ***", "\n")
cat("\n")

# Top 5 Main Maker-Taker exchanges
sym_depth_volume_reg_top5_t100 <- symbol_depth_volume_regressions(depth_volume_main_maker_T100, "depth_share_main_maker", "volume_share_main_maker")
data.table::fwrite(sym_depth_volume_reg_top5_t100, file.path(table.dir,"sym_depth_volume_reg_top5_t100.csv"))

# Report mean and standard deviation of the coefficients
print("Symbol Depth Volume (Top 5 shares).")
print(paste0("Mean of Coef: ", mean(sym_depth_volume_reg_top5_t100$coef)))
print(paste0("SD of Coef: ", sd(sym_depth_volume_reg_top5_t100$coef)))

# Mean and standard deviation of the R2s
print(paste0("Mean of R2: ", mean(sym_depth_volume_reg_top5_t100$R2)))
print(paste0("SD of R2: ", sd(sym_depth_volume_reg_top5_t100$R2)))
cat("\n")

##########################################################################################################
#_______________________________ Depth-Volume at different time intervals _______________________________#
##########################################################################################################

cat("*** Depth-volume shares regressions, different time intervals ***", "\n")
cat("\n")

# Robustness checks, Top 100 symbols
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/Depth_Volume_Shares_5min_2015_Top5.csv"), "T100_5min")
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/Depth_Volume_Shares_10min_2015_Top5.csv"), "T100_10min")
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/Depth_Volume_Shares_30min_2015_Top5.csv"), "T100_30min")
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/Depth_Volume_Shares_1hr_2015_Top5.csv"), "T100_1hr")

# Robustness checks, SPY
# Cluster is set to false since there is only one symbols (we normally cluster on symbol and exchange).
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/SPY_Depth_Volume_Shares_1sec_2015_Top5.csv"), "SPY_1sec", cluster = FALSE)
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/SPY_Depth_Volume_Shares_10sec_2015_Top5.csv"), "SPY_10sec", cluster = FALSE)
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/SPY_Depth_Volume_Shares_30sec_2015_Top5.csv"), "SPY_30sec", cluster = FALSE)
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/SPY_Depth_Volume_Shares_1min_2015_Top5.csv"), "SPY_1min", cluster = FALSE)
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/SPY_Depth_Volume_Shares_5min_2015_Top5.csv"), "SPY_5min", cluster = FALSE)
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/SPY_Depth_Volume_Shares_10min_2015_Top5.csv"), "SPY_10min", cluster = FALSE)
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/SPY_Depth_Volume_Shares_30min_2015_Top5.csv"), "SPY_30min", cluster = FALSE)
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/SPY_Depth_Volume_Shares_1hr_2015_Top5.csv"), "SPY_1hr", cluster = FALSE)
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/SPY_Depth_Volume_Shares_2hr_2015_Top5.csv"), "SPY_2hr", cluster = FALSE)
depth_volume_interval(file.path(data.dir, "Depth Volume Intervals/SPY_Depth_Volume_Shares_6hr_2015_Top5.csv"), "SPY_6hr", cluster = FALSE)

sink()
